4. Main Analysis
# Rating distribution, with bars filled by district (zip code).
# Columns are referenced by bare name inside aes() so that ggplot2 looks them
# up in `data`, matching the scatter/hex plots further down; `food$...` inside
# aes() bypasses that lookup and is discouraged by ggplot2.
ggplot(data = food, aes(x = Rating, fill = factor(zip_code))) +
  # y is each bar's count divided by the sum of all bar counts.
  # after_stat() replaces the deprecated `..count..` notation.
  geom_histogram(
    aes(y = after_stat(count / sum(count))),
    breaks = seq(0, 5, by = 0.5),
    position = "dodge"
  ) +
  labs(x = "Rating", y = "Density", fill = "Zip Code")
We can see that restaurant ratings in district 10036 peak around 3.5, while restaurant ratings in district 10019 peak around 3. Both the 10036 and 10019 distributions are left-skewed.
# Price-level distribution by zip code, drawn as a per-group density with
# one-unit bins. 10019 and 10036 look much alike; both are dominated by
# price level 2.
# Earlier variant kept for reference (it references `food$Postcode` rather
# than `zip_code`):
#ggplot(data = food, aes(food$Price, fill=factor(food$Postcode))) + geom_histogram(aes(x= food$Price), binwidth = 1,position = 'dodge')
# Bare column names are used inside aes() so ggplot2 resolves them in `data`;
# after_stat(density) replaces the deprecated `..density..` notation.
ggplot(data = food, aes(x = Price, fill = factor(zip_code))) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 1,
    position = "dodge"
  ) +
  labs(x = "Price", fill = "Zip Code")
The above graph shows that most of the restaurants in the area have price level 2. Districts 10019 and 10036 seem to have similar price distribution patterns.
We can also see that 10011, 10012, and 10033 each have only one restaurant, so the density for those districts will always be 1.
# Review-count distribution by zip code, in bins of 50 reviews.
# Bare column names are used inside aes() so ggplot2 resolves them in `data`.
ggplot(data = food, aes(x = Review_Count, fill = factor(zip_code))) +
  geom_histogram(position = "dodge", binwidth = 50) +
  # Zoom with coord_cartesian() rather than xlim(): xlim() sets scale limits,
  # so observations outside 0-800 (and bars whose bin edges fall outside the
  # limits) are removed before drawing, whereas coord_cartesian() only
  # changes the visible window.
  coord_cartesian(xlim = c(0, 800)) +
  labs(x = "Review Count", fill = "Zip Code")
# Box plots of review count per zip code, flipped so the zip codes run down
# the vertical axis.
# Observation: restaurants in 10036 are more likely to have more reviews.
# Bare column names are used inside aes() so ggplot2 resolves them in `data`
# instead of reaching back into `food` with `$`.
ggplot(food, aes(x = factor(zip_code), y = Review_Count)) +
  geom_boxplot() +
  coord_flip() +
  labs(x = "Zip Code", y = "Review Count")
For review count, we find that restaurants in 10036 are more likely to have more reviews, with restaurants located in 10019 coming second. The restaurants in 10018 are more sparse.
# Rating against review count, drawn twice: first as hexagonal bin counts,
# then as a jittered, semi-transparent scatter plot.
ggplot(data = food, mapping = aes(x = Rating, y = Review_Count)) +
  geom_hex() +
  labs(y = "Review Count")

ggplot(data = food, mapping = aes(x = Rating, y = Review_Count)) +
  geom_point(alpha = 0.3, position = "jitter") +
  labs(y = "Review Count")
From the plots we can see that:
there is a cluster around ratings 3–4 with fewer than 500 reviews;
the more reviews a restaurant has, the more likely it is to have a high rating;
no restaurant with more than 600 reviews has a low rating;
the restaurants with over 1,000 reviews are outliers, and they are all rated 3.5 or 4.0.
# Price level against review count, drawn twice: first as hexagonal bin
# counts, then as a jittered, semi-transparent scatter plot.
ggplot(data = food, mapping = aes(x = Price, y = Review_Count)) +
  geom_hex() +
  labs(y = "Review Count")

ggplot(data = food, mapping = aes(x = Price, y = Review_Count)) +
  geom_point(alpha = 0.3, position = "jitter") +
  labs(y = "Review Count")
There is a cluster around price levels 1–2 with fewer than 500 reviews.
Outliers are the points with over 1,000 reviews and ratings over 3.
# Mosaic plot of restaurant category by zip code, restricted to the four zip
# codes and the four second-level categories listed below.
data_new <- subset(food, zip_code %in% c(10036, 10019, 10018, 10020))
data_new <- subset(
  data_new,
  Category_2nd_Level %in% c("North American", "Deli", "Europe", "Asian")
)

# Count restaurants per (category, zip code) pair.
# Grouping uses bare column names so it always refers to the rows flowing
# through the pipe; `data_new$...` would be evaluated against the full
# data_new and fail with a length mismatch as soon as drop_na() removed a row.
# ungroup() returns a plain tibble instead of one still grouped by category.
counts3 <- data_new %>%
  drop_na(zip_code, Category_2nd_Level) %>%
  group_by(Category_2nd_Level, zip_code) %>%
  summarize(Freq = n()) %>%
  ungroup()
colnames(counts3) <- c("Category_data", "zip_code", "Freq")

# factor_cat is a free-standing vector (not a column of counts3); the mosaic
# formula picks it up from the calling environment.
factor_cat <- factor(counts3$Category_data)

# `set_varnames` is spelled out in full rather than relying on partial
# matching of `set_varname`.
vcd::mosaic(
  factor_cat ~ zip_code,
  data = counts3,
  direction = c("v", "h"),
  rot_labels = c(0, 90, 0, 0),
  offset_varnames = c(left = 5),
  offset_labels = c(left = 0),
  just_labels = c("center", "right"),
  margins = c(left = 5),
  set_varnames = c(zip_code = "Zip Code", factor_cat = "Category")
)
From the plot we can see that the majority of the data lies in 10019 and 10036, and that the district influences the restaurant category mix. There are more North American restaurants in 10036 and relatively fewer restaurants of other types.
Interactive Parallel Coordinate
library(GGally)
library(ggplot2)
library(tidyverse)
# Data for the parallel-coordinate plots: seven columns of `food`, selected by
# position, with three of them converted to factors.
parallel_data <- food[c(4, 5, 6, 7, 8, 9, 11)]
parallel_data[["Category_data"]] <- factor(parallel_data[["Category_data"]])
parallel_data[["Category_2nd_Level"]] <- factor(
  parallel_data[["Category_2nd_Level"]]
)
parallel_data[["Price"]] <- factor(parallel_data[["Price"]])

# Disabled: factor conversion of zip_code.
#parallel_data$zip_code<-factor(parallel_data$zip_code)

# Disabled: static parallel-coordinate plots, one per ggparcoord scaling
# method ("uniminmax", "globalminmax", "robust", "std").
#ggparcoord(parallel_data ,alphaLines = .7, groupColumn = 'Category_2nd_Level', scale = "uniminmax")+ylab('Data')+xlab('Indicator')
#ggparcoord(parallel_data ,alphaLines = .7, groupColumn = 'Category_2nd_Level', scale = "globalminmax")+ylab('Data')+xlab('Indicator')
#ggparcoord(parallel_data ,alphaLines = .7, groupColumn = 'Category_2nd_Level', scale = "robust")+ylab('Data')+xlab('Indicator')
#ggparcoord(parallel_data ,alphaLines = .7, groupColumn = 'Category_2nd_Level', scale = "std")+ylab('Data')+xlab('Indicator')
# Build the frequency table that feeds the alluvial plot.
# Columns 2 and 4-11 of `food` are kept; their names are shown in the
# colnames() output below.
food_pl <- as.data.frame(food[, c(2, 4:11)])

# Round Score and Rating to whole numbers before turning them into factors.
# NOTE(review): round() sends halves to the even neighbour (3.5 -> 4, but
# 4.5 -> 4 and 2.5 -> 2). Ratings come in half steps, so confirm this binning
# is intended.
food_pl$Score <- round(food_pl$Score)
food_pl$Rating <- round(food_pl$Rating)
food_pl[, 1:9] <- lapply(food_pl[, 1:9], factor)

# Drop every row that has a missing value in any column.
# The former `cols = c("Score")` argument was removed: na.omit() on a plain
# data.frame ignores it, so it wrongly suggested that only Score was checked.
# NOTE(review): if only Score should be checked, use
# tidyr::drop_na(food_pl, Score) instead - that changes which rows are kept.
food_plna <- na.omit(food_pl)
colnames(food_plna)
## [1] "ID" "Category_data" "Category_2nd_Level"
## [4] "Rating" "Review_Count" "Price"
## [7] "Street_Num" "zip_code" "Score"

# One row per distinct combination of the grouping columns, with its row
# count in Freq. ungroup() returns a plain tibble rather than one that is
# still grouped by the leading columns.
food_al <- food_plna %>%
  group_by(
    Category_2nd_Level, Rating, Price, Street_Num, zip_code,
    Review_Count, Category_data
  ) %>%
  summarise(Freq = n()) %>%
  ungroup()
library(alluvial)
# Alluvial diagram across category, rating, price level and zip code.
# Each flow is coloured by its second-level category: categories are numbered
# in order of first appearance in food_al and indexed into a 10-colour
# "Set3" palette.
pal <- RColorBrewer::brewer.pal(n = 10, name = "Set3")
alluvial(
  food_al[, c("Category_2nd_Level", "Rating", "Price", "zip_code")],
  freq = food_al[["Freq"]],
  col = with(
    food_al,
    pal[match(Category_2nd_Level, unique(Category_2nd_Level))]
  ),
  alpha = 0.8,
  blocks = TRUE
)
The majority of the restaurants serve North American food, followed by European-style restaurants and delis. Most of the North American and European restaurants have ratings of 4 and above, and most of those restaurants are not too expensive, with a price level of 2 (around 20 to 30 dollars per person per meal). Asian restaurants are the fourth-biggest group in our data. Their ratings range from 2 to 4, and more than half of the Asian restaurants are rated 4. Approximately 95% of the Asian restaurants set their price level within two dollar signs. Coffee shops usually have low ratings and low cost. Steakhouses are all rated highly, and their price levels are also high. More than 50% of the restaurants are located in the zip code 10036 area. Their categories are mainly North American, European, coffee shop, Asian food, and steakhouse.